home *** CD-ROM | disk | FTP | other *** search
- /* rxgrm.c */
- /* REGULAR EXPRESSION GRAMMAR */
- Goal
- -> RegularExpression <eof>
- RegularExpression
- -> AltExpr => rx_emit RXNT_ROOT
- AltExpr
- -> RegExpr
- -> AltExpr '|' RegExpr => rx_emit RXNT_ALT
- RegExpr
- -> RXpr
- -> ^ RXpr => rx_emit RXNT_RX_BEGIN
- -> RXpr $ => rx_emit RXNT_RX_END
- -> ^ RXpr $ => rx_emit RXNT_RX_BOTH
- RXpr
- -> Span
- -> RXpr Span => rx_emit RXNT_RX
- Span
- -> TermList
- -> Span .* => rx_emit RXNT_SPAN_0_FROM
- -> Span .+ => rx_emit RXNT_SPAN_1_FROM
- -> .* TermList => rx_emit RXNT_SPAN_0_TO
- -> .+ TermList => rx_emit RXNT_SPAN_1_TO
- -> Span .* TermList => rx_emit RXNT_SPAN_0
- -> Span .+ TermList => rx_emit RXNT_SPAN_1
- TermList
- -> Term
- -> TermList Term => rx_emit RXNT_TERM_LIST
- Term
- -> Primary * => rx_emit RXNT_STAR
- -> Primary + => rx_emit RXNT_PLUS
- -> Primary ? => rx_emit RXNT_BIN
- -> Primary
- -> . => rx_emit RXNT_ANY_C
- Primary
- -> Char
- -> <esc> => rx_emit RXNT_ESC_C
- -> ( AltExpr )
- -> [ ClassList ] => rx_emit RXNT_CLASS
- -> [^ ClassList ] => rx_emit RXNT_NOT_CLASS
- ClassList
- -> ClassItem
- -> ClassList ClassItem => rx_emit RXNT_CLASS_LIST
- ClassItem
- -> Char - Char => rx_emit RXNT_RANGE
- -> Char
- -> <esc> => rx_emit RXNT_ESC_C
- Char -> <c> => rx_emit RXNT_C
-
- /* rx.h */
- /* RX.H -- Header file for regular expression processing */
- #define SPACE ' '
- #define TAB '\t'
- #define NL '\n'
- #define NULLP (char *) 0
- #define AND &&
- #define OR ||
- #define BACK_SLASH '\\'
- #define isoctal(x) ((x) >= '0' AND (x) <= '7')
- typedef enum {
- ERROR_TOK, EOF_TOK, BEGIN_TOK, /* --- Token Types --- */
- END_TOK, ALT_TOK, CHAR_TOK,
- ANY_CHAR_TOK, ESCAPE_TOK, LPAREN_TOK,
- RPAREN_TOK, LBRACKET_TOK, NOT_TOK,
- RBRACKET_TOK, STAR_TOK, PLUS_TOK,
- BIN_TOK, DASH_TOK, SPAN0_TOK,
- SPAN1_TOK,
- RXNT_ROOT, /* --- Node Types --- */
- RXNT_RX, RXNT_RX_BEGIN, RXNT_RX_END,
- RXNT_RX_BOTH, RXNT_SPAN_0, RXNT_SPAN_1,
- RXNT_SPAN_0_TO, RXNT_SPAN_1_TO, RXNT_SPAN_0_FROM,
- RXNT_SPAN_1_FROM, RXNT_ALT, RXNT_TERM_LIST,
- RXNT_STAR, RXNT_PLUS, RXNT_BIN,
- RXNT_C, RXNT_ANY_C, RXNT_ESC_C,
- RXNT_RANGE, RXNT_CLASS, RXNT_CLASS_LIST,
- RXNT_NOT_CLASS
- } RX_SYM;
- typedef struct RX_NODE { /* --- REG EXPR NODE --- */
- RX_SYM type; /* Node type (see above) */
- char c; /* Character */
- char flag; /* Flags: */
- #define RX_BEGIN 0x01 /* Match beginning of field */
- #define RX_END 0x02 /* Match end of field */
- struct RX_NODE *lnode; /* Ptr to left-node */
- struct RX_NODE *rnode; /* Ptr to right-node */
- } RXNODE;
- #define RXNODE_0 (RXNODE *) 0
- #ifdef RXTRNL /* rx_main has this #define'd. So has definitions */
- char *token; /* Ptr to curr token. */
- char *input; /* Ptr to char after curr token. */
- char *T_beg; /* token-begin ptr */
- char *T_end; /* token-end ptr */
- RXNODE *rx_root = (RXNODE *) 0; /* Ptr to root node. */
- int rx_match_pos = -1; /* Position of match. */
- int rx_match_len = -1; /* Length of match. */
- #else /* Other modules just contain declarations */
- extern char *input, *token, *T_beg, *T_end;
- extern RXNODE *rx_root;
- extern int rx_match_pos, rx_match_len;
- #endif /* RXTRNL */
- int rx_parse (char *); /* --- Func Declarations --- */
- void rx_print_tree (RXNODE *, int);
- int rx_match (RXNODE *, char *);
- int rx_scan (void);
-
- /* rx_main.c */
- /* RX_MAIN.C -- Driver program for the compiler */
- #define RXTRNL
- #include <string.h>
- #include <stdio.h>
- #include <dir.h>
- #include <dos.h>
- #include <rx.h> /* globals defined here because RXTRNL is #define'd */
- int main (int argc, char **argv)
- {
- int i, rc, match_sw, lin;
- FILE *stream;
- char *rx, *cp;
- char txt [258];
- struct ffblk ffblk; /* For Turbo C "findfirst" & "findnext" functions. */
- if (argc != 3) {
- printf ("RX.EXE ----- Usage is:\n");
- printf (" RX \"<regexpr>\" <filemask>\n");
- printf ("Where <regexpr> is a regular expression, in double quotes.\n");
- printf ("And <filemask> is a DOS filename specification, wildcards OK.\n");
- exit (8);
- }
- rx = argv[1];
- i = rx_parse (rx); /* Parse the reg. expr. */
- if (i) {
- printf ("Parser error = %d\n", i);
- exit (4);
- }
- printf ("RegExpr ==> \"%s\"\n", rx);
- rc = findfirst (argv[2], &ffblk, 0);
- while (rc == 0) { /* Try to match lines */
- match_sw = 0;
- lin = 1;
- stream = fopen (ffblk.ff_name, "r");
- if (stream == (FILE *) 0) {
- printf ("RX.EXE ----- Error OPENing data file '%s'.\n",
- strupr(ffblk.ff_name));
- exit(4);
- }
- printf ("File: %s\n", strupr(ffblk.ff_name));
- txt[0]=0;
- while (fgets (txt+1, 256, stream) != 0) {
- if ((cp = strchr (txt+1, NL)) != NULLP)
- *cp = 0;
- /* ----- Attempt to match the reg.expr. ----- */
- if (rx_match (rx_root, txt+1)) {
- match_sw = 1;
- printf ("%3d| %s\n", lin, txt+1);
- }
- lin++;
- }
- fclose (stream);
- if (!match_sw)
- printf ("No match occurred.\n");
- rc = findnext (&ffblk);
- }
- return;
- }
-
- /* rx_scan.c */
- /* RX_SCAN -- Lexical analyzer for regular expressions */
- #include <ctype.h>
- #include <rx.h>
- static int in_brackets = 0; /* In class brackets ? */
- int rx_scan (void)
- {
- token=input;
- switch (*input++) {
- case 0:
- case NL: return EOF_TOK;
- case '^': if (in_brackets) return CHAR_TOK;
- return BEGIN_TOK;
- case '$': if (in_brackets) return CHAR_TOK;
- return END_TOK;
- case '.': if (in_brackets) return CHAR_TOK;
- if (*input=='*') { ++input; return SPAN0_TOK; }
- if (*input=='+') { ++input; return SPAN1_TOK; }
- return ANY_CHAR_TOK;
- case '[': ++in_brackets;
- if (*input=='^') { ++input; return NOT_TOK; }
- else { return LBRACKET_TOK; }
- case ']': --in_brackets; return RBRACKET_TOK;
- case '|': if (in_brackets) return CHAR_TOK;
- return ALT_TOK;
- case '(': if (in_brackets) return CHAR_TOK;
- return LPAREN_TOK;
- case ')': if (in_brackets) return CHAR_TOK;
- return RPAREN_TOK;
- case '*': if (in_brackets) return CHAR_TOK;
- return STAR_TOK;
- case '+': if (in_brackets) return CHAR_TOK;
- return PLUS_TOK;
- case '-': if (in_brackets
- AND isalnum(*(input-2))
- AND isalnum(*input))
- return DASH_TOK;
- return CHAR_TOK;
- case '?': if (in_brackets) return CHAR_TOK;
- return BIN_TOK;
- case BACK_SLASH:
- if (*input=='b' OR *input=='f' OR *input=='n'
- OR *input=='r' OR *input=='t') /* Std escape sequences */
- { ++input; return ESCAPE_TOK; }
- if (isoctal(*input) AND isoctal(*(input+1))
- AND isoctal(*(input+2))) /* Octal number */
- { input+=3; return ESCAPE_TOK; }
- ++input; /* Else, literal char */
- ++token;
- return CHAR_TOK;
- default:
- return CHAR_TOK;
- }
- }
-
- /* rx_emit.c */
- /* RX_EMIT.C -- Emitter routines for abstract syntax tree. */
- #include <rx.h>
- #define RX_STK_SIZE 40 /* Arbitrary limits. Should be */
- #define RX_NUM_NODES 200 /* enough for most cases, tho. */
- static RXNODE *rxstk [RX_STK_SIZE]; /* Semantic stack for reg expr. */
- static int idx = -1; /* Index into semantic stack. */
- static RXNODE rxnodearea [RX_NUM_NODES]; /* Stg for RXNODEs. */
- static RXNODE *rxnp = rxnodearea - 1; /* Pointer to last used node. */
- static RXNODE *rxnp_max = rxnodearea + RX_NUM_NODES; /* End of node area. */
- static RXNODE *make_rxnode (int, RXNODE *, RXNODE *); /* Func Decl. */
- int rx_emit (int nodetype)
- {
- char c;
- int i;
- RXNODE *np;
- switch (nodetype) {
- case RXNT_ROOT: rx_root = rxstk[idx];
- break;
- case RXNT_RX:
- case RXNT_STAR:
- case RXNT_PLUS:
- case RXNT_BIN:
- case RXNT_CLASS:
- case RXNT_NOT_CLASS:
- /* --- The single line of code below does it all! 1). Create a new
- node. 2). Pop the top node pointer off the semantic stack.
- 3). Attach it as the new node's left-node pointer. 4). Push
- pointer to new node onto the stack. */
- rxstk[idx] = make_rxnode (nodetype, rxstk[idx], RXNODE_0);
- break;
- case RXNT_RX_BEGIN:
- case RXNT_RX_END:
- case RXNT_RX_BOTH:
- rx_emit (RXNT_RX); /* Note node-type changed to RXNT_RX */
- if (nodetype == RXNT_RX_BEGIN
- OR nodetype == RXNT_RX_BOTH)
- rxstk[idx]->flag |= RX_BEGIN;
- if (nodetype == RXNT_RX_END
- OR nodetype == RXNT_RX_BOTH)
- rxstk[idx]->flag |= RX_END;
- break;
- case RXNT_ALT:
- case RXNT_TERM_LIST:
- case RXNT_CLASS_LIST:
- case RXNT_RANGE:
- case RXNT_SPAN_0:
- case RXNT_SPAN_1:
- rxstk[idx-1] = make_rxnode (nodetype, rxstk[idx-1], rxstk[idx]);
- --idx;
- break;
- case RXNT_SPAN_0_TO:
- case RXNT_SPAN_1_TO:
- rxstk[idx] = make_rxnode (nodetype, RXNODE_0, rxstk[idx]);
- break;
- case RXNT_SPAN_0_FROM:
- case RXNT_SPAN_1_FROM:
- rxstk[idx] = make_rxnode (nodetype, rxstk[idx], RXNODE_0);
- break;
- case RXNT_C:
- rxstk[++idx] = make_rxnode (nodetype, RXNODE_0, RXNODE_0);
- rxstk[idx]->c = *T_beg; /* Parser maintains "T_beg". */
- break;
- case RXNT_ANY_C:
- rxstk[++idx] = make_rxnode (nodetype, RXNODE_0, RXNODE_0);
- break;
- case RXNT_ESC_C: /* Note node type changed to RXNT_C. */
- np = rxstk[++idx] = make_rxnode (RXNT_C, RXNODE_0, RXNODE_0);
- c = *(T_beg+1);
- if (c == 'b') { np->c = '\b'; return 0; }
- if (c == 'f') { np->c = '\f'; return 0; }
- if (c == 'n') { np->c = '\n'; return 0; }
- if (c == 'r') { np->c = '\r'; return 0; }
- if (c == 't') { np->c = '\t'; return 0; }
- if (isoctal(*(T_beg+1))
- AND isoctal(*(T_beg+2))
- AND isoctal(*(T_beg+3))) {
- i = (*(T_beg+1) - '0') * 64;
- i += (*(T_beg+2) - '0') * 8;
- i += *(T_beg+3) - '0';
- np->c = i;
- }
- else
- np->c = *T_beg+1;
- break;
- default: exit (3); /* Something has gone wrong. */
- }
- if (idx >= RX_STK_SIZE) {
- printf ("Node-pointer stack overflow.\n");
- exit (4);
- }
- return 0;
- }
- static RXNODE *make_rxnode (int type, RXNODE *lnode, RXNODE *rnode)
- {
- if (++rxnp >= rxnp_max) { /* Alloc area for a new node */
- printf ("Node area overflow\n");
- exit (4);
- }
- rxnp->type = type;
- rxnp->lnode = lnode;
- rxnp->rnode = rnode;
- rxnp->c = rxnp->flag = 0;
- return (rxnp);
- }
-
- /* rx_match.c */
- #include <string.h>
- #include <rx.h>
- static int match_node (RXNODE *, char *); /* Local function */
- int rx_match (RXNODE *np, char *txt)
- {
- int i, j;
- for (i=0; txt[i]; i++) {
- j = match_node (np, txt+i);
- if (j >= 0) {
- rx_match_pos = i; /* A MATCH !!! */
- rx_match_len = j;
- return 1;
- }
- }
- rx_match_pos = -1; /* No match. */
- rx_match_len = -1;
- return 0;
- }
- static int match_node (RXNODE *np, char *txt)
- {
- int i, j, n, dot;
- if (np == (RXNODE *) 0) return 0;
- switch (np->type) {
- case RXNT_RX: /* Must match both sub-nodes. */
- if (np->flag & RX_BEGIN
- AND *(txt-1) != 0)
- return -1;
- i = match_node (np->lnode, txt);
- if (i<0) return -1;
- j = match_node (np->rnode, txt+i);
- if (j<0) return -1;
- if (np->flag & RX_END
- AND *(txt+i+j+1) != 0)
- return -1;
- return i+j;
- case RXNT_ALT: /* Match if either subnode matches. */
- i = match_node (np->lnode, txt);
- j = match_node (np->rnode, txt);
- return i>j ? i : j; /* Return the longer matching length. */
- case RXNT_TERM_LIST: /* Must match both subnodes. */
- i = match_node (np->lnode, txt);
- if (i<0) return -1;
- j = match_node (np->rnode, txt+i);
- if (j<0) return -1;
- return i+j;
- case RXNT_PLUS: /* Must match 1 or more occurance(s) of left-node. */
- i = match_node (np->lnode, txt);
- if (i<0) return -1;
- /* ------ Fall into RXNT_STAR ----- */
- case RXNT_STAR: /* Match 0 or more occurance(s) of left-node. */
- n = (np->type == RXNT_PLUS ? i : 0); /* Account for fall-in. */
- while (1) {
- i = match_node (np->lnode, txt+n);
- if (i<=0) return n;
- n += i;
- }
- case RXNT_BIN: /* Must match 0 or 1 occurance of left-node. */
- i = match_node (np->lnode, txt);
- return i<0 ? 0 : i;
- case RXNT_RANGE: /* Must be in range, inclusive. */
- return (*txt < np->lnode->c OR *txt > np->rnode->c) ? -1 : 1;
- case RXNT_CLASS: /* Must match left-node. */
- i = match_node (np->lnode, txt);
- return i>0 ? i : -1;
- case RXNT_CLASS_LIST: /* Must match either subnode. */
- i = match_node (np->lnode, txt);
- if (i>0) return i;
- j = match_node (np->rnode, txt);
- if (j>0) return j;
- return -1;
- case RXNT_NOT_CLASS: /* Must not match left-node. */
- i = match_node (np->lnode, txt);
- return i>0 ? -1 : 1;
- case RXNT_SPAN_0:
- case RXNT_SPAN_1:
- i = match_node (np->lnode, txt);
- if (i<0) return -1;
- if (i >= strlen(txt) AND np->type == RXNT_SPAN_1)
- return -1;
- dot = (np->type == RXNT_SPAN_1 ? 1 : 0);
- for (n=strlen(txt+i+dot); n>=0; n--) {
- j = match_node (np->rnode, txt+i+dot+n);
- if (j>=0) return i+dot+n+j;
- }
- return -1;
- case RXNT_SPAN_0_TO:
- case RXNT_SPAN_1_TO:
- dot = (np->type == RXNT_SPAN_1_TO ? 1 : 0);
- for (n=strlen(txt+dot); n>=0; n--) {
- i = match_node (np->rnode, txt+dot+n);
- if (i>=0) return dot+n+i;
- }
- return -1;
- case RXNT_SPAN_0_FROM:
- case RXNT_SPAN_1_FROM:
- i = match_node (np->lnode, txt);
- if (i<0) return -1;
- if (i >= strlen(txt) AND np->type == RXNT_SPAN_1_FROM)
- return -1;
- return strlen(txt);
- case RXNT_C: /* Must match the character. */
- return *txt == np->c ? 1 : -1;
- case RXNT_ANY_C: /* Matches anything except end-of-string. */
- return *txt ? 1 : -1;
- default: /* "I don't think we're in Kansas anymore, Toto." */
- exit (1);
- }
- }
-